import_projection <- function(proj_dir, proj_name){
# Import a 2D projection -- this is used for visualization
proj_coords <- read.csv(paste(proj_dir, proj_name, sep="/"), header=F, sep=" ")
colnames(proj_coords) = c("x1","x2")
return(proj_coords)
}
import_clusters <- function(clust_dir, clust_name){
# Import cluster labels.
clust_labels <- read.csv(paste(clust_dir, clust_name, sep="/"), header=F)
colnames(clust_labels) <- "cluster"
clust_labels$cluster <- factor(clust_labels$cluster)
return(clust_labels)
}
import_aux <- function(aux_dir, aux_file){
# Import auxiliary data (in this case, population labels)
samples <- read.csv(paste(aux_dir, aux_file, sep="/"), sep="\t")
samples$sample <- as.character(samples$sample)
samples <- samples[,-c(3)]
return(samples)
}
import_descriptions <- function(aux_dir, aux_file){
# Import population descriptions by population code
temp <- read.csv(paste(aux_dir, aux_file, sep="/"), sep="\t")
# we only want the two relevant columns
# last row is "Total"
temp <- temp[1:nrow(temp)-1,c(1,2)]
temp$Population.Description <- as.character(temp$Population.Description)
return(temp)
}
import_ids <- function(id_dir, id_file){
# Import the 1000GP IDs as ordered in the original VCF
ids <- read.csv(paste(id_dir, id_file, sep="/"), sep = " ", header=F)
return(ids)
}
get_cluster_means <- function(in_data, dim1, dim2, cluster_var){
# Calculate the mean coordinate of a population variable (e.g. cluster)
cluster_coords <- in_data[,which(colnames(in_data) %in% c(cluster_var, dim1, dim2))]
coord_means_1 <- cluster_coords %>%
group_by(!! sym(cluster_var)) %>%
summarize(x1 = mean(!! sym(dim1)))
coord_means_2 <- cluster_coords %>%
group_by(!! sym(cluster_var)) %>%
summarize(x2 = mean(!! sym(dim2)))
coord_means <- left_join(coord_means_1, coord_means_2, by = c(cluster_var))
return(coord_means)
}
proj_dir <-"data"
proj_name <- "1000G_UMAP_PC100_NC2_NN15_MD0.5_admixturenotebook"
cluster_dir <- "data"
aux_dir <- "data/"
aux_file <- "affy_samples.20141118.panel"
label_file <- "20131219.populations.tsv"
id_file <- "1KGP_ids.txt"
coords <- import_projection(proj_dir, proj_name)
samples <- import_aux(aux_dir, aux_file)
descriptions <- import_descriptions(aux_dir, label_file)
ids <- import_ids(aux_dir, id_file)
s <- 0.1
a <- 0.4
f <- "hdbscan_labels_min25_EPS0.5_1000G_UMAP_PC16_NC5_NN50_MD0.01_euclidean_2019814225811"
clusters <- import_clusters(cluster_dir, f)
main_data <- data.frame(cbind(ids, coords))
colnames(main_data)[1] <- "ID"
main_data$ID <- as.character(main_data$ID)
main_data <- main_data[,-c(2)]
# add population labels
main_data <- merge(main_data, samples, by.x = "ID", by.y = "sample")
# add cluster labels
main_data$cluster <- clusters$cluster
pop_means <- get_cluster_means(main_data, "x1", "x2", "pop")
# Note that in this 2D visualization, the ITU and GIH populations are split into multiple clusters
pop_means <- subset(pop_means, !(pop %in% c("ITU","GIH")))
pop_means <- rbind.data.frame(pop_means, list("ITU",-7,5))
pop_means <- rbind.data.frame(pop_means, list("ITU",15,16))
pop_means <- rbind.data.frame(pop_means, list("GIH",-7,5))
pop_means <- rbind.data.frame(pop_means, list("GIH",2,15))
ggplot(main_data, aes(x=x1, y=x2, colour=pop)) +
geom_point(size=s, alpha=a) +
scale_color_manual(name = "pop", values=pal) +
ggtitle("Thousand Genomes, coloured by population") +
xlab("UMAP1") + ylab("UMAP2") + theme_bw() +
theme(legend.position = "none") +
geom_label_repel(data = pop_means,
aes(x=x1, y=x2, label=pop, colour = as.factor(pop)), size=5, alpha=0.6, label.size=0.05)

# In this visualization, cluster 16 (mostly ITU individuals) is split into multiple clusters
cluster_means <- get_cluster_means(main_data, "x1", "x2", "cluster")
cluster_means <- subset(cluster_means, cluster!=16)
cluster_means_extra <- cluster_means[FALSE,]
cluster_means_extra <- rbind.data.frame(cluster_means_extra, data.frame(cluster="16",x1=-4.3,x2=5))
cluster_means_extra <- rbind.data.frame(cluster_means_extra, data.frame(cluster="16",x1=15,x2=14))
# Colour palette
colourCount <- length(unique(main_data$cluster))
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
colpal <- colorRampPalette(brewer.pal(8, "Set1"))(colourCount)
# manually change some colours:
# 14 is too yellow
colpal[15] <- "#FFBC2D"
# randomly permute the colours for clarity.
# sequential clusters are often (1) next to each other and (2) similarly coloured.
colpal <- sample(colpal)
# Some colours too similar:
# 11 and 12, 9 and 10,
ggplot(main_data, aes(x=x1, y=x2, colour=cluster)) +#, shape=cluster)) +
geom_point(size=s, alpha=a) +
scale_color_manual(values = colpal) +
geom_label_repel(data = cluster_means,
aes(x=x1, y=x2, label=cluster, colour = as.factor(cluster)), size=5, alpha=0.6) +
geom_label(data = cluster_means_extra,
aes(x=x1, y=x2, label=cluster, colour = as.factor(cluster)), size=5, alpha=0.6) +
ggtitle("Thousand Genomes, coloured by clustering algorithm") +
xlab("UMAP1") + ylab("UMAP2") + theme_bw() +
theme(legend.position = "none")

Interactive plots
s <- 2
main_data <- data.frame(cbind(ids, coords))
colnames(main_data)[1] <- "ID"
main_data$ID <- as.character(main_data$ID)
main_data <- main_data[,-c(2)]
# add population labels
main_data <- merge(main_data, samples, by.x = "ID", by.y = "sample")
# add cluster labels
main_data$cluster <- clusters$cluster
# unassigned boolean
# The clustering for this demo has been selected to not have any unclustered individuals
main_data$unassigned <- ifelse(main_data$cluster==-1,"unassigned","assigned")
colourCount <- length(unique(main_data$cluster))
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
fig_main <- plot_ly(data = main_data, x = ~x1, y = ~x2,
marker = list(size=s)) %>%
add_markers(color = ~pop, colors = pal,
text = ~paste( "ID", ID, "\nCluster", cluster),
type = "scattergl", mode = "markers")
fig_main
fig_clusters <- plot_ly(data = main_data, x = ~x1, y = ~x2,
marker = list(size=s)) %>%
add_markers(color = ~cluster, colors = getPalette(colourCount),
text = ~paste( "ID", ID, "\nPopulation", pop),
type = "scattergl", mode = "markers")
fig_clusters
print_pop_table <- function(p, in_data){
# Print the cluster membership of a specified population
cat(paste(p, "below"))
temp_data <- in_data %>% filter(pop==p)
print(table(temp_data$cluster))
cat("Proportion of population in largest cluster:")
print(max(table(temp_data$cluster))/sum(table(temp_data$cluster)))
cat("\n")
}
for (p in unique(main_data$pop)){
print_pop_table(p, main_data)
}
## GBR below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 105 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## FIN below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 104 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## CHS below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 171
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## PUR below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 145 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 0.9731544
##
## CDX below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 18 19 20
## 104 0 0
## Proportion of population in largest cluster:[1] 0.9904762
##
## CLM below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 4 142 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 0.9726027
##
## IBS below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 162 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## PEL below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 128 0 0 0 0 1 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 0.9922481
##
## PJL below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 48 95 12 0 0 0 0
## Proportion of population in largest cluster:[1] 0.6129032
##
## KHV below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3
## 18 19 20
## 118 0 0
## Proportion of population in largest cluster:[1] 0.9752066
##
## ACB below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 122 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## GWD below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 179 1
## Proportion of population in largest cluster:[1] 0.9944444
##
## ESN below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 172 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## BEB below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 133 0 0 0 0 0 0 0 10 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 0.9300699
##
## MSL below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 122
## Proportion of population in largest cluster:[1] 1
##
## STU below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 124 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 0.96875
##
## ITU below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 109 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 0.9237288
##
## CEU below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 183 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## YRI below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 1 181 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 0.9945055
##
## CHB below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 105
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## JPT below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 0 0 0 0 0 0 0 104 0 0 0 1
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 0.9904762
##
## LWK below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 110 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## ASW below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 0 0 0 103 0 0 0 0 1 3 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 0.9626168
##
## MXL below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 0 0 7 0 0 0 0 0 0 0 97 0 0 0 0 0 0 0 0 0
## Proportion of population in largest cluster:[1] 0.9326923
##
## TSI below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 0 0 111 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 19 20
## 0 0 0
## Proportion of population in largest cluster:[1] 1
##
## GIH below
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 0 0 0 0 69 0 0 0 0 0 0 0 0 0 41 1 0 0 0 0
## Proportion of population in largest cluster:[1] 0.6216216